# Standard library
import sqlite3

# Third-party
import polars as pl
import plotly.express as px
import missingno as msno

# Load the Lightcast job postings export.
# NOTE(review): the pasted output showed (72476, 131) — 72,476 postings,
# 131 columns — confirm against the current CSV.
jobs = pl.read_csv("lightcast_job_postings.csv")
print(jobs.shape)
print(jobs.columns)
# Per-column overview: name, dtype, and distinct-value count.
# pl.all().n_unique() computes every column's cardinality in a single pass;
# the original issued one `jobs.select(...).item()` query per column
# (131 separate scans of the frame).
schema = pl.DataFrame({
    'Column': jobs.columns,
    'Data Type': jobs.dtypes,
    'Unique Values': list(jobs.select(pl.all().n_unique()).row(0)),
})
print(schema.shape)
print(schema.head(20))
print(schema.head(20))shape: (20, 3)
┌────────────────────────┬───────────┬───────────────┐
│ Column ┆ Data Type ┆ Unique Values │
│ --- ┆ --- ┆ --- │
│ str ┆ object ┆ i64 │
╞════════════════════════╪═══════════╪═══════════════╡
│ ID ┆ String ┆ 72476 │
│ LAST_UPDATED_DATE ┆ String ┆ 169 │
│ LAST_UPDATED_TIMESTAMP ┆ String ┆ 174 │
│ DUPLICATES ┆ Int64 ┆ 72 │
│ POSTED ┆ String ┆ 153 │
│ … ┆ … ┆ … │
│ MODELED_DURATION ┆ Int64 ┆ 61 │
│ COMPANY ┆ Int64 ┆ 12303 │
│ COMPANY_NAME ┆ String ┆ 12302 │
│ COMPANY_RAW ┆ String ┆ 17214 │
│ COMPANY_IS_STAFFING ┆ Boolean ┆ 2 │
└────────────────────────┴───────────┴───────────────┘
some cleaning up first (round 1)…
# Cleanup round 1: drop identifiers/URLs and the coarser NAICS/SOC hierarchy
# levels, keeping only the most granular codes (SOC_2021_5, NAICS_2022_6).
columns_to_drop = [
    'ID', 'URL', 'ACTIVE_URLS', 'LAST_UPDATED_TIMESTAMP', 'DUPLICATES',
    'NAICS2', 'NAICS2_NAME', 'NAICS3', 'NAICS3_NAME', 'NAICS4', 'NAICS4_NAME',
    'NAICS5', 'NAICS5_NAME', 'NAICS6', 'NAICS6_NAME',
    'SOC_2021_2', 'SOC_2021_2_NAME', 'SOC_2021_3', 'SOC_2021_3_NAME',
    'SOC_2021_4', 'SOC_2021_4_NAME', 'SOC_2', 'SOC_2_NAME',
    'SOC_3', 'SOC_3_NAME', 'SOC_4', 'SOC_4_NAME', 'SOC_5', 'SOC_5_NAME',
    'NAICS_2022_2', 'NAICS_2022_2_NAME', 'NAICS_2022_3', 'NAICS_2022_3_NAME',
    'NAICS_2022_4', 'NAICS_2022_4_NAME', 'NAICS_2022_5', 'NAICS_2022_5_NAME',
]
jobs_filtered = jobs.drop(columns_to_drop)
print(jobs_filtered.columns)
print(jobs_filtered.shape)
# Per-column null counts, reshaped long (one row per column) for inspection.
nan_counts = jobs_filtered.null_count()
# looking at all columns
nan_counts_long = nan_counts.unpivot().rename(
    {"variable": "Column", "value": "NaN Count"}
)
print(nan_counts_long.shape)
print(nan_counts_long.head(20))
┌───────────────────────┬───────────┐
│ Column ┆ NaN Count │
│ --- ┆ --- │
│ str ┆ u32 │
╞═══════════════════════╪═══════════╡
│ LAST_UPDATED_DATE ┆ 0 │
│ POSTED ┆ 0 │
│ EXPIRED ┆ 7822 │
│ DURATION ┆ 27294 │
│ SOURCE_TYPES ┆ 0 │
│ … ┆ … │
│ EDUCATION_LEVELS ┆ 0 │
│ EDUCATION_LEVELS_NAME ┆ 0 │
│ MIN_EDULEVELS ┆ 0 │
│ MIN_EDULEVELS_NAME ┆ 0 │
│ MAX_EDULEVELS ┆ 56155 │
└───────────────────────┴───────────┘
# Missingness-correlation heatmap (missingno requires a pandas frame).
jobs_filtered_pd = jobs_filtered.to_pandas()
msno.heatmap(jobs_filtered_pd)

# Keep only columns that actually contain nulls, and add the null share
# as a percentage of total rows.
nan_counts = jobs_filtered.null_count()
total_rows = jobs_filtered.height
nan_counts_filtered = (
    nan_counts.unpivot()
    .rename({"variable": "Column", "value": "NaN Count"})
    .filter(pl.col("NaN Count") > 0)
    .with_columns(
        (pl.col("NaN Count") / total_rows * 100).alias("NaN Percentage")
    )
)
print(nan_counts_filtered.to_pandas())
0 EXPIRED 7822 10.792538
1 DURATION 27294 37.659363
2 ACTIVE_SOURCES_INFO 64654 89.207462
3 TITLE_RAW 60 0.082786
4 MODELED_EXPIRED 15383 21.224957
5 MODELED_DURATION 19261 26.575694
6 COMPANY_RAW 497 0.685744
7 MAX_EDULEVELS 56155 77.480821
8 MAX_EDULEVELS_NAME 56155 77.480821
9 MIN_YEARS_EXPERIENCE 23113 31.890557
10 MAX_YEARS_EXPERIENCE 64046 88.368563
11 SALARY 41658 57.478338
12 ORIGINAL_PAY_PERIOD 40068 55.284508
13 SALARY_TO 40068 55.284508
14 SALARY_FROM 40068 55.284508
15 MSA 3908 5.392130
16 MSA_NAME 3908 5.392130
17 MSA_OUTGOING 3908 5.392130
18 MSA_NAME_OUTGOING 3908 5.392130
19 MSA_INCOMING 3921 5.410067
20 MSA_NAME_INCOMING 3921 5.410067
21 TITLE_CLEAN 96 0.132458
22 LIGHTCAST_SECTORS 54682 75.448424
23 LIGHTCAST_SECTORS_NAME 54682 75.448424
# Visualize the null percentage of every column that has any nulls.
nan_counts_filtered_pd = nan_counts_filtered.to_pandas()
fig = px.bar(
    nan_counts_filtered_pd,
    x="Column",
    y="NaN Percentage",
    color="NaN Percentage",
    title="NaN Percentages per Column",
    color_continuous_scale="Viridis",
)
fig.update_layout(width=1200, height=500, xaxis_tickangle=-45)
fig.show()
# Cleanup round 2: drop columns that are >= 50% null — except the salary
# fields (SALARY, SALARY_FROM, SALARY_TO), which are sparse but drive the
# salary analysis below.
columns_to_drop = [
    "ACTIVE_SOURCES_INFO", "MAX_EDULEVELS", "MAX_EDULEVELS_NAME",
    "MAX_YEARS_EXPERIENCE", "ORIGINAL_PAY_PERIOD",
    "LIGHTCAST_SECTORS", "LIGHTCAST_SECTORS_NAME",
]
jobs_filtered_2 = jobs_filtered.drop(columns_to_drop)
print(jobs_filtered_2.shape)

# Deduplicate postings that share title, company, location, and post date;
# keep="first" retains the first occurrence of each duplicate group.
jobs_filtered_2 = jobs_filtered_2.unique(
    subset=["TITLE", "COMPANY", "LOCATION", "POSTED"], keep="first"
)
print(jobs_filtered_2.shape)
# Re-check null percentages after the round-2 drops and deduplication.
nan_counts = jobs_filtered_2.null_count()
total_rows = jobs_filtered_2.height
nan_counts_filtered = (
    nan_counts.unpivot()
    .rename({"variable": "Column", "value": "NaN Count"})
    .filter(pl.col("NaN Count") > 0)
    .with_columns(
        (pl.col("NaN Count") / total_rows * 100).alias("NaN Percentage")
    )
)
print(nan_counts_filtered.to_pandas())
0 EXPIRED 7462 10.783237
1 DURATION 26092 37.705202
2 TITLE_RAW 54 0.078035
3 MODELED_EXPIRED 14739 21.299133
4 MODELED_DURATION 18401 26.591040
5 COMPANY_RAW 489 0.706647
6 MIN_YEARS_EXPERIENCE 22339 32.281792
7 SALARY 39954 57.736994
8 SALARY_TO 38490 55.621387
9 SALARY_FROM 38490 55.621387
10 MSA 2806 4.054913
11 MSA_NAME 2806 4.054913
12 MSA_OUTGOING 2806 4.054913
13 MSA_NAME_OUTGOING 2806 4.054913
14 MSA_INCOMING 2816 4.069364
15 MSA_NAME_INCOMING 2816 4.069364
16 TITLE_CLEAN 88 0.127168
# Insight into remote jobs: distribution of REMOTE_TYPE_NAME.
# Nulls appear as their own group in value_counts (shown as [None]).
remote_type_series = jobs_filtered_2.get_column("REMOTE_TYPE_NAME")
value_counts = remote_type_series.value_counts(sort=True)
print(value_counts)
┌──────────────────┬───────┐
│ REMOTE_TYPE_NAME ┆ count │
│ --- ┆ --- │
│ str ┆ u32 │
╞══════════════════╪═══════╡
│ [None] ┆ 54211 │
│ Remote ┆ 11745 │
│ Hybrid Remote ┆ 2151 │
│ Not Remote ┆ 1093 │
└──────────────────┴───────┘
go ahead with a SQL connection…
# Load the cleaned frame into an in-memory SQLite database for SQL queries.
conn = sqlite3.connect(':memory:')
conn.execute("DROP TABLE IF EXISTS jobs;")
# Quote each identifier so column names can never collide with SQL keywords.
columns = ", ".join([f'"{col}" TEXT' for col in jobs_filtered_2.columns])
create_table_query = f"CREATE TABLE jobs ({columns});"
conn.execute(create_table_query)
insert_query = f"INSERT INTO jobs VALUES ({', '.join(['?'] * len(jobs_filtered_2.columns))})"
# rows() yields native Python values, so polars nulls arrive as None and
# become SQL NULL; to_numpy().tolist() would coerce mixed dtypes through a
# numpy object array and turn nulls into NaN, breaking the IS NOT NULL
# filters used by the salary queries below.
conn.executemany(insert_query, jobs_filtered_2.rows())
conn.commit()
print("Data from jobs_filtered_2 has been successfully inserted into the SQLite database.")
# Sanity check: row count in SQLite should equal the frame height.
query_count = """
SELECT COUNT(*) AS TotalCount
FROM jobs;
"""
cursor = conn.cursor()
cursor.execute(query_count)
result = cursor.fetchall()
# orient="row" matches the list-of-row-tuples shape fetchall() returns
# (consistent with the other query results below).
total_count = pl.DataFrame(result, schema=["TotalCount"], orient="row")
print(total_count)
┌────────────┐
│ TotalCount │
│ --- │
│ i64 │
╞════════════╡
│ 69200 │
└────────────┘
by NAICS_2022_6_NAME
# Distinct industry count at the NAICS 2022 6-digit level.
# .item() extracts the scalar directly; no need to round-trip through numpy.
unique_naics_count = jobs_filtered_2.select(pl.col("NAICS_2022_6_NAME").n_unique()).item()
print(f"Unique NAICS_2022_6_NAME count: {unique_naics_count}")
# We're interested in the finance, marketing, and transportation industries;
# count how many distinct 6-digit NAICS names match those keywords.
query_filtered_naics_count = """
SELECT COUNT(DISTINCT NAICS_2022_6_NAME) AS Unique_NAICS_Count
FROM jobs
WHERE NAICS_2022_6_NAME LIKE '%finance%'
OR NAICS_2022_6_NAME LIKE '%banking%'
OR NAICS_2022_6_NAME LIKE '%investing%'
OR NAICS_2022_6_NAME LIKE '%investments%'
OR NAICS_2022_6_NAME LIKE '%marketing%'
OR NAICS_2022_6_NAME LIKE '%transportation%';
"""
cursor = conn.cursor()
cursor.execute(query_filtered_naics_count)
result = cursor.fetchall()
unique_filtered_naics_count_df = pl.DataFrame(result, schema=["Unique_NAICS_Count"], orient="row")
print(unique_filtered_naics_count_df)
┌────────────────────┐
│ Unique_NAICS_Count │
│ --- │
│ i64 │
╞════════════════════╡
│ 28 │
└────────────────────┘
# List the matched industries themselves (28 expected, so raise the row cap).
pl.Config.set_tbl_rows(28)
query_filtered_naics = """
SELECT DISTINCT NAICS_2022_6_NAME
FROM jobs
WHERE NAICS_2022_6_NAME LIKE '%finance%'
OR NAICS_2022_6_NAME LIKE '%banking%'
OR NAICS_2022_6_NAME LIKE '%investing%'
OR NAICS_2022_6_NAME LIKE '%investments%'
OR NAICS_2022_6_NAME LIKE '%marketing%'
OR NAICS_2022_6_NAME LIKE '%transportation%';
"""
cursor = conn.cursor()
cursor.execute(query_filtered_naics)
result = cursor.fetchall()
filtered_naics_df = pl.DataFrame(result, schema=["NAICS_2022_6_NAME"], orient="row")
print(filtered_naics_df.shape)
print(filtered_naics_df)
┌─────────────────────────────────┐
│ NAICS_2022_6_NAME │
│ --- │
│ str │
╞═════════════════════════════════╡
│ Commercial Banking │
│ Scheduled Passenger Air Transp… │
│ Telemarketing Bureaus and Othe… │
│ Investment Banking and Securit… │
│ Marketing Consulting Services │
│ Freight Transportation Arrange… │
│ Regulation of Agricultural Mar… │
│ Marketing Research and Public … │
│ Special Needs Transportation │
│ Regulation and Administration … │
│ Nonscheduled Chartered Freight… │
│ Scenic and Sightseeing Transpo… │
│ Other Support Activities for A… │
│ Public Finance Activities │
│ Nonscheduled Chartered Passeng… │
│ Pipeline Transportation of Nat… │
│ School and Employee Bus Transp… │
│ Scheduled Freight Air Transpor… │
│ Pipeline Transportation of Cru… │
│ All Other Pipeline Transportat… │
│ Inland Water Freight Transport… │
│ Deep Sea Freight Transportatio… │
│ All Other Transit and Ground P… │
│ Pipeline Transportation of Ref… │
│ Transportation Equipment and S… │
│ Support Activities for Rail Tr… │
│ All Other Transportation Equip… │
│ Other Support Activities for R… │
└─────────────────────────────────┘
# Average salary per matched industry, restricted to rows with complete
# salary information; industries with no such rows drop out of the result
# (the pasted run showed 23 of the 28 remaining).
query_avg_salary_by_filtered_naics = """
SELECT
NAICS_2022_6_NAME,
COUNT(NAICS_2022_6_NAME) AS Occurrences,
AVG(SALARY) AS Avg_Salary,
AVG(SALARY_FROM) AS Avg_Salary_From,
AVG(SALARY_TO) AS Avg_Salary_To
FROM jobs
WHERE SALARY IS NOT NULL AND SALARY_TO IS NOT NULL AND SALARY_FROM IS NOT NULL
AND (NAICS_2022_6_NAME LIKE '%finance%'
OR NAICS_2022_6_NAME LIKE '%banking%'
OR NAICS_2022_6_NAME LIKE '%investing%'
OR NAICS_2022_6_NAME LIKE '%investments%'
OR NAICS_2022_6_NAME LIKE '%marketing%'
OR NAICS_2022_6_NAME LIKE '%transportation%')
GROUP BY NAICS_2022_6_NAME
ORDER BY Occurrences DESC;
"""
cursor = conn.cursor()
cursor.execute(query_avg_salary_by_filtered_naics)
result = cursor.fetchall()
pl.Config.set_tbl_rows(28)
pl.Config.set_tbl_cols(None)
avg_salary_by_filtered_naics_df = pl.DataFrame(result, schema=["NAICS_2022_6_NAME", "Occurrences", "Avg_Salary", "Avg_Salary_From", "Avg_Salary_To"], orient="row")
print(avg_salary_by_filtered_naics_df.shape)
print(avg_salary_by_filtered_naics_df)
┌─────────────────────────────────┬─────────────┬───────────────┬─────────────────┬───────────────┐
│ NAICS_2022_6_NAME ┆ Occurrences ┆ Avg_Salary ┆ Avg_Salary_From ┆ Avg_Salary_To │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ f64 ┆ f64 ┆ f64 │
╞═════════════════════════════════╪═════════════╪═══════════════╪═════════════════╪═══════════════╡
│ Commercial Banking ┆ 922 ┆ 127314.572668 ┆ 103130.91757 ┆ 151398.733189 │
│ Telemarketing Bureaus and Othe… ┆ 201 ┆ 90259.129353 ┆ 72430.199005 ┆ 107765.024876 │
│ Investment Banking and Securit… ┆ 94 ┆ 120806.776596 ┆ 95981.223404 ┆ 144876.382979 │
│ Marketing Consulting Services ┆ 48 ┆ 79475.604167 ┆ 69490.916667 ┆ 87989.833333 │
│ Regulation and Administration … ┆ 28 ┆ 93925.928571 ┆ 76147.75 ┆ 111066.857143 │
│ Scheduled Passenger Air Transp… ┆ 18 ┆ 121229.555556 ┆ 105449.777778 ┆ 137009.444444 │
│ Marketing Research and Public … ┆ 14 ┆ 90237.857143 ┆ 73339.285714 ┆ 107136.428571 │
│ Public Finance Activities ┆ 13 ┆ 108434.923077 ┆ 85725.0 ┆ 131145.153846 │
│ Regulation of Agricultural Mar… ┆ 11 ┆ 85541.909091 ┆ 73135.454545 ┆ 97948.636364 │
│ Freight Transportation Arrange… ┆ 10 ┆ 79420.3 ┆ 70311.1 ┆ 88529.8 │
│ Other Support Activities for A… ┆ 6 ┆ 82500.0 ┆ 71333.333333 ┆ 93666.666667 │
│ Nonscheduled Chartered Freight… ┆ 6 ┆ 108785.333333 ┆ 91843.166667 ┆ 125727.666667 │
│ Pipeline Transportation of Nat… ┆ 3 ┆ 98722.666667 ┆ 76333.333333 ┆ 120000.0 │
│ Special Needs Transportation ┆ 2 ┆ 62565.5 ┆ 57566.0 ┆ 65286.5 │
│ School and Employee Bus Transp… ┆ 2 ┆ 62650.0 ┆ 61090.0 ┆ 64210.0 │
│ Scenic and Sightseeing Transpo… ┆ 2 ┆ 80069.5 ┆ 67591.0 ┆ 92548.0 │
│ All Other Transit and Ground P… ┆ 2 ┆ 90675.0 ┆ 77500.0 ┆ 103850.0 │
│ Support Activities for Rail Tr… ┆ 1 ┆ 150000.0 ┆ 140000.0 ┆ 160000.0 │
│ Scheduled Freight Air Transpor… ┆ 1 ┆ 62400.0 ┆ 62400.0 ┆ 62400.0 │
│ Other Support Activities for R… ┆ 1 ┆ 55000.0 ┆ 50000.0 ┆ 60000.0 │
│ Nonscheduled Chartered Passeng… ┆ 1 ┆ 47008.0 ┆ 47008.0 ┆ 47008.0 │
│ Deep Sea Freight Transportatio… ┆ 1 ┆ 131400.0 ┆ 111300.0 ┆ 151500.0 │
│ All Other Transportation Equip… ┆ 1 ┆ 89523.0 ┆ 73299.0 ┆ 105747.0 │
└─────────────────────────────────┴─────────────┴───────────────┴─────────────────┴───────────────┘
by TITLE_NAME
# Distinct job-title count; .item() replaces the numpy [0][0] round-trip.
unique_title_count = jobs_filtered_2.select(pl.col("TITLE_NAME").n_unique()).item()
print(f"Unique TITLE_NAME count: {unique_title_count}")
# We're interested in analyst positions: list every distinct title that
# mentions analyst / analytics / data.
pl.Config.set_tbl_rows(5000)
query_filtered_title = """
SELECT DISTINCT TITLE_NAME
FROM jobs
WHERE TITLE_NAME LIKE '%analyst%'
OR TITLE_NAME LIKE '%analytics%'
OR TITLE_NAME LIKE '%data%';
"""
cursor = conn.cursor()
cursor.execute(query_filtered_title)
result = cursor.fetchall()
filtered_title_df = pl.DataFrame(result, schema=["TITLE_NAME"], orient="row")
print(filtered_title_df.shape)
print(filtered_title_df.head(20))
┌────────────────────────────────┐
│ TITLE_NAME │
│ --- │
│ str │
╞════════════════════════════════╡
│ Data Analysts │
│ Oracle Analysts │
│ Data Reporting Analysts │
│ Data Management Analysts │
│ Health Data Analysts │
│ Lead Data Analysts │
│ Customer Experience Analysts │
│ Data Analytics Leads │
│ WFM Analysts │
│ ERP Business Analysts │
│ Business Intelligence Analysts │
│ Oracle Database Administrators │
│ Oracle Functional Analysts │
│ Principal Data Scientists │
│ Oracle Programmer Analysts │
│ Enterprise Data Architects │
│ Data and Reporting Analysts │
│ ERP Systems Analysts │
│ SAP Functional Analysts │
│ Fraud Analysts │
└────────────────────────────────┘
# Average salary per analyst-style title, restricted to rows with complete
# salary information; titles without any such rows drop out (the pasted run
# showed 1005 of 1476 remaining).
query_avg_salary_by_filtered_title = """
SELECT
TITLE_NAME,
COUNT(TITLE_NAME) AS Occurrences,
AVG(SALARY) AS Avg_Salary,
AVG(SALARY_FROM) AS Avg_Salary_From,
AVG(SALARY_TO) AS Avg_Salary_To
FROM jobs
WHERE SALARY IS NOT NULL AND SALARY_TO IS NOT NULL AND SALARY_FROM IS NOT NULL
AND (TITLE_NAME LIKE '%analyst%'
OR TITLE_NAME LIKE '%analytics%'
OR TITLE_NAME LIKE '%data%')
GROUP BY TITLE_NAME
ORDER BY Occurrences DESC;
"""
cursor = conn.cursor()
cursor.execute(query_avg_salary_by_filtered_title)
result = cursor.fetchall()
pl.Config.set_tbl_rows(1500)
pl.Config.set_tbl_cols(None)
avg_salary_by_filtered_title_df = pl.DataFrame(result, schema=["TITLE_NAME", "Occurrences", "Avg_Salary", "Avg_Salary_From", "Avg_Salary_To"], orient="row")
print(avg_salary_by_filtered_title_df.shape)
print(avg_salary_by_filtered_title_df.head(20))
┌─────────────────────────────────┬─────────────┬───────────────┬─────────────────┬───────────────┐
│ TITLE_NAME ┆ Occurrences ┆ Avg_Salary ┆ Avg_Salary_From ┆ Avg_Salary_To │
│ --- ┆ --- ┆ --- ┆ --- ┆ --- │
│ str ┆ i64 ┆ f64 ┆ f64 ┆ f64 │
╞═════════════════════════════════╪═════════════╪═══════════════╪═════════════════╪═══════════════╡
│ Data Analysts ┆ 3467 ┆ 93825.56533 ┆ 81035.9152 ┆ 106047.649841 │
│ Business Intelligence Analysts ┆ 938 ┆ 103024.716418 ┆ 87234.638593 ┆ 118629.098081 │
│ Data Analytics Engineers ┆ 325 ┆ 185512.461538 ┆ 161054.243077 ┆ 209111.68 │
│ Data and Reporting Analysts ┆ 290 ┆ 79633.92069 ┆ 66126.458621 ┆ 91923.089655 │
│ Data Governance Analysts ┆ 228 ┆ 111827.557018 ┆ 94360.548246 ┆ 128478.942982 │
│ Data Quality Analysts ┆ 193 ┆ 104164.165803 ┆ 91157.621762 ┆ 116710.108808 │
│ Data Analytics Analysts ┆ 192 ┆ 109418.296875 ┆ 87121.588542 ┆ 131655.473958 │
│ Data Management Analysts ┆ 178 ┆ 107976.398876 ┆ 92009.02809 ┆ 123313.651685 │
│ Data Modelers ┆ 172 ┆ 136975.895349 ┆ 118876.593023 ┆ 155075.238372 │
│ Lead Data Analysts ┆ 170 ┆ 111721.929412 ┆ 95335.964706 ┆ 127742.011765 │
│ Research Data Analysts ┆ 166 ┆ 81899.710843 ┆ 69225.656627 ┆ 94308.771084 │
│ IT Data Analytics Analysts ┆ 164 ┆ 107843.939024 ┆ 84157.25 ┆ 131339.04878 │
│ Lead Business Intelligence Ana… ┆ 151 ┆ 120744.960265 ┆ 100244.437086 ┆ 140527.245033 │
│ Data Science Analysts ┆ 133 ┆ 114863.992481 ┆ 96807.609023 ┆ 132389.849624 │
│ Data Analytics Leads ┆ 120 ┆ 162420.35 ┆ 120716.908333 ┆ 203772.808333 │
│ Data Operations Analysts ┆ 114 ┆ 88823.359649 ┆ 73918.745614 ┆ 102379.657895 │
│ Business Intelligence Data Ana… ┆ 113 ┆ 103099.938053 ┆ 87012.858407 ┆ 119073.752212 │
│ Health Data Analysts ┆ 107 ┆ 91458.654206 ┆ 74872.214953 ┆ 107990.401869 │
│ Data and Analytics Consultants ┆ 107 ┆ 127020.224299 ┆ 94460.691589 ┆ 158536.672897 │
│ Enterprise Data Architects ┆ 106 ┆ 166127.603774 ┆ 146011.613208 ┆ 186243.622642 │
└─────────────────────────────────┴─────────────┴───────────────┴─────────────────┴───────────────┘
now use both of the queries…
# Combine both filters: analyst positions within the industries we're
# interested in, with complete salary information, grouped by the
# (title, industry) pair.
query_avg_salary_by_filtered_title_naics = """
SELECT
TITLE_NAME,
NAICS_2022_6_NAME,
COUNT(TITLE_NAME) AS Occurrences,
AVG(SALARY) AS Avg_Salary,
AVG(SALARY_FROM) AS Avg_Salary_From,
AVG(SALARY_TO) AS Avg_Salary_To
FROM jobs
WHERE SALARY IS NOT NULL AND SALARY_TO IS NOT NULL AND SALARY_FROM IS NOT NULL
AND (TITLE_NAME LIKE '%analyst%'
OR TITLE_NAME LIKE '%analytics%'
OR TITLE_NAME LIKE '%data%')
AND (NAICS_2022_6_NAME LIKE '%finance%'
OR NAICS_2022_6_NAME LIKE '%banking%'
OR NAICS_2022_6_NAME LIKE '%investing%'
OR NAICS_2022_6_NAME LIKE '%investments%'
OR NAICS_2022_6_NAME LIKE '%marketing%'
OR NAICS_2022_6_NAME LIKE '%transportation%')
GROUP BY TITLE_NAME, NAICS_2022_6_NAME
ORDER BY Occurrences DESC;
"""
cursor = conn.cursor()
cursor.execute(query_avg_salary_by_filtered_title_naics)
result = cursor.fetchall()
pl.Config.set_tbl_rows(250)
pl.Config.set_tbl_cols(None)
avg_salary_by_filtered_title_naics_df = pl.DataFrame(result, schema=["TITLE_NAME", "NAICS_2022_6_NAME", "Occurrences", "Avg_Salary", "Avg_Salary_From", "Avg_Salary_To"], orient="row")
print(avg_salary_by_filtered_title_naics_df.shape)
print(avg_salary_by_filtered_title_naics_df.head(20))
┌─────────────────┬─────────────────┬─────────────┬───────────────┬────────────────┬───────────────┐
│ TITLE_NAME ┆ NAICS_2022_6_NA ┆ Occurrences ┆ Avg_Salary ┆ Avg_Salary_Fro ┆ Avg_Salary_To │
│ --- ┆ ME ┆ --- ┆ --- ┆ m ┆ --- │
│ str ┆ --- ┆ i64 ┆ f64 ┆ --- ┆ f64 │
│ ┆ str ┆ ┆ ┆ f64 ┆ │
╞═════════════════╪═════════════════╪═════════════╪═══════════════╪════════════════╪═══════════════╡
│ Data and ┆ Telemarketing ┆ 65 ┆ 81366.523077 ┆ 64625.384615 ┆ 97198.0 │
│ Reporting ┆ Bureaus and ┆ ┆ ┆ ┆ │
│ Analysts ┆ Othe… ┆ ┆ ┆ ┆ │
│ Health Data ┆ Telemarketing ┆ 51 ┆ 94717.647059 ┆ 75764.705882 ┆ 113670.588235 │
│ Analysts ┆ Bureaus and ┆ ┆ ┆ ┆ │
│ ┆ Othe… ┆ ┆ ┆ ┆ │
│ Data ┆ Commercial ┆ 41 ┆ 172009.756098 ┆ 137607.804878 ┆ 206411.707317 │
│ Integration ┆ Banking ┆ ┆ ┆ ┆ │
│ Leads ┆ ┆ ┆ ┆ ┆ │
│ Data Governance ┆ Commercial ┆ 35 ┆ 129052.342857 ┆ 103377.371429 ┆ 154727.314286 │
│ Analysts ┆ Banking ┆ ┆ ┆ ┆ │
│ Data Analysts ┆ Commercial ┆ 31 ┆ 105933.612903 ┆ 87030.903226 ┆ 124547.258065 │
│ ┆ Banking ┆ ┆ ┆ ┆ │
│ Crime Analysts ┆ Telemarketing ┆ 25 ┆ 46635.28 ┆ 44214.4 ┆ 48977.6 │
│ ┆ Bureaus and ┆ ┆ ┆ ┆ │
│ ┆ Othe… ┆ ┆ ┆ ┆ │
│ Lead Business ┆ Commercial ┆ 23 ┆ 113100.0 ┆ 86840.0 ┆ 139360.0 │
│ Intelligence ┆ Banking ┆ ┆ ┆ ┆ │
│ Ana… ┆ ┆ ┆ ┆ ┆ │
│ Data Management ┆ Commercial ┆ 20 ┆ 159859.65 ┆ 124353.0 ┆ 195366.3 │
│ Analysts ┆ Banking ┆ ┆ ┆ ┆ │
│ Business ┆ Commercial ┆ 19 ┆ 109138.526316 ┆ 88355.842105 ┆ 129921.210526 │
│ Intelligence ┆ Banking ┆ ┆ ┆ ┆ │
│ Analysts ┆ ┆ ┆ ┆ ┆ │
│ Data Quality ┆ Commercial ┆ 17 ┆ 107957.117647 ┆ 85142.588235 ┆ 130771.705882 │
│ Analysts ┆ Banking ┆ ┆ ┆ ┆ │
│ Data Analytics ┆ Commercial ┆ 16 ┆ 167973.3125 ┆ 134563.0 ┆ 201383.625 │
│ Leads ┆ Banking ┆ ┆ ┆ ┆ │
│ Enterprise Risk ┆ Commercial ┆ 16 ┆ 108521.25 ┆ 81160.4375 ┆ 135528.6875 │
│ Analysts ┆ Banking ┆ ┆ ┆ ┆ │
│ Data Analysts ┆ Regulation and ┆ 14 ┆ 93407.285714 ┆ 73797.285714 ┆ 113018.285714 │
│ ┆ Administration ┆ ┆ ┆ ┆ │
│ ┆ … ┆ ┆ ┆ ┆ │
│ Data Warehouse ┆ Commercial ┆ 14 ┆ 86984.714286 ┆ 66977.0 ┆ 106312.0 │
│ Business ┆ Banking ┆ ┆ ┆ ┆ │
│ Analys… ┆ ┆ ┆ ┆ ┆ │
│ Data Quality ┆ Commercial ┆ 13 ┆ 156542.230769 ┆ 129509.615385 ┆ 183574.846154 │
│ Leads ┆ Banking ┆ ┆ ┆ ┆ │
│ Business ┆ Investment ┆ 12 ┆ 125392.5 ┆ 92790.75 ┆ 157995.0 │
│ Intelligence ┆ Banking and ┆ ┆ ┆ ┆ │
│ Analysts ┆ Securit… ┆ ┆ ┆ ┆ │
│ Business ┆ Telemarketing ┆ 11 ┆ 107000.0 ┆ 84927.272727 ┆ 129072.727273 │
│ Intelligence ┆ Bureaus and ┆ ┆ ┆ ┆ │
│ Analysts ┆ Othe… ┆ ┆ ┆ ┆ │
│ Data Analysts ┆ Investment ┆ 10 ┆ 82393.0 ┆ 70166.0 ┆ 94620.0 │
│ ┆ Banking and ┆ ┆ ┆ ┆ │
│ ┆ Securit… ┆ ┆ ┆ ┆ │
│ Data Analytics ┆ Commercial ┆ 10 ┆ 101205.0 ┆ 88850.8 ┆ 113559.2 │
│ Analysts ┆ Banking ┆ ┆ ┆ ┆ │
│ Data ┆ Commercial ┆ 10 ┆ 115110.0 ┆ 92377.0 ┆ 137843.0 │
│ Integration ┆ Banking ┆ ┆ ┆ ┆ │
│ Analysts ┆ ┆ ┆ ┆ ┆ │
└─────────────────┴─────────────────┴─────────────┴───────────────┴────────────────┴───────────────┘
plot 1
# Plot 1: top 20 (title, industry) combinations by posting count.
# Re-materializes the frame from `result` so this section can be re-run
# independently of the previous print section.
avg_salary_by_filtered_title_naics_df = pl.DataFrame(result, schema=["TITLE_NAME", "NAICS_2022_6_NAME", "Occurrences", "Avg_Salary", "Avg_Salary_From", "Avg_Salary_To"], orient="row")
avg_salary_by_filtered_title_naics_df_pd = avg_salary_by_filtered_title_naics_df.to_pandas()
# Result is already sorted by Occurrences DESC, so head(20) is the top 20.
top_20_df = avg_salary_by_filtered_title_naics_df_pd.head(20)
fig = px.bar(top_20_df,
             x='TITLE_NAME',
             y='Occurrences',
             color='NAICS_2022_6_NAME',
             title="Top 20 Job Titles by Occurrences and their Industry with Applied Filters",
             labels={'TITLE_NAME': 'Job Title', 'Occurrences': 'Number of Occurrences'},
             hover_data=['Avg_Salary', 'Avg_Salary_From', 'Avg_Salary_To'])
fig.update_layout(xaxis_tickangle=45)
fig.show()
# Plot 2: top 7 states by posting count under the same title/industry filters.
# FIX: the original query also selected TITLE_NAME and NAICS_2022_6_NAME
# while grouping only by STATE_NAME; SQLite fills such bare (non-aggregated)
# columns from an arbitrary row of each group, so those values were
# meaningless. They are dropped here — the pie chart only ever used
# STATE_NAME and Occurrences.
query_avg_salary_by_filtered_title_naics_state = """
SELECT
STATE_NAME,
COUNT(TITLE_NAME) AS Occurrences,
AVG(SALARY) AS Avg_Salary,
AVG(SALARY_FROM) AS Avg_Salary_From,
AVG(SALARY_TO) AS Avg_Salary_To
FROM jobs
WHERE SALARY IS NOT NULL AND SALARY_TO IS NOT NULL AND SALARY_FROM IS NOT NULL
AND (TITLE_NAME LIKE '%analyst%'
OR TITLE_NAME LIKE '%analytics%'
OR TITLE_NAME LIKE '%data%')
AND (NAICS_2022_6_NAME LIKE '%finance%'
OR NAICS_2022_6_NAME LIKE '%banking%'
OR NAICS_2022_6_NAME LIKE '%investing%'
OR NAICS_2022_6_NAME LIKE '%investments%'
OR NAICS_2022_6_NAME LIKE '%marketing%'
OR NAICS_2022_6_NAME LIKE '%transportation%')
GROUP BY STATE_NAME
ORDER BY Occurrences DESC
LIMIT 7;
"""
cursor = conn.cursor()
cursor.execute(query_avg_salary_by_filtered_title_naics_state)
result = cursor.fetchall()
pl.Config.set_tbl_rows(250)
pl.Config.set_tbl_cols(None)
avg_salary_by_filtered_title_naics_state_df = pl.DataFrame(result, schema=["STATE_NAME", "Occurrences", "Avg_Salary", "Avg_Salary_From", "Avg_Salary_To"], orient="row")
avg_salary_by_filtered_title_naics_state_df_pd = avg_salary_by_filtered_title_naics_state_df.to_pandas()
fig = px.pie(avg_salary_by_filtered_title_naics_state_df_pd,
             names='STATE_NAME',
             values='Occurrences',
             title="Top 7 States by Occurrences with Applied Filters",
             color='STATE_NAME')
fig.show()
# Plot 3: treemap of analyst roles nested under industry — tile size is the
# posting count, tile color is the average salary.
avg_salary_by_filtered_title_naics_df_pd = avg_salary_by_filtered_title_naics_df.to_pandas()
fig = px.treemap(
    avg_salary_by_filtered_title_naics_df_pd,
    path=['NAICS_2022_6_NAME', 'TITLE_NAME'],
    values='Occurrences',
    color='Avg_Salary',
    color_continuous_scale='Viridis',
    title='Analyst Positions within Selected Industries with Recorded Salary Information'
)
fig.update_layout(
    margin=dict(t=50, l=25, r=25, b=25),
    coloraxis_colorbar=dict(
        title="Avg Salary",
        tickprefix="$"
    )
)
fig.show()

# Done with the database.
conn.close()